library(tidyverse)
library(here)
library(readr)
library(readxl)
library(data.table)
# Load the cleaned candy survey data. `gender` and `country` are pinned to
# character so readr does not have to guess the type of those two columns.
candy <- here("clean_data/candy_clean.csv") %>%
  read_csv(
    col_types = cols(
      gender = col_character(),
      country = col_character()
    )
  )
# Preview one raw workbook (the 2015 file) for illustration.
# NOTE(review): the original note reads "1/3", presumably one of three raw
# files; confirm.
here("raw_data/boing-boing-candy-2015.xlsx") %>%
  read_excel() %>%
  data.table()
# Display the cleaned data as a data.table (the result is printed, not stored).
data.table(candy)
# Number of rows that carry a rating: rows with a missing `rating` are
# dropped before counting.
candy %>%
  drop_na(rating) %>%
  nrow()
## [1] 757108
# Mean age, rounded to whole years, for each value of `going_out`.
candy %>%
  group_by(going_out) %>%
  summarise(
    avg_age = age %>%
      mean(na.rm = TRUE) %>%
      round()
  ) %>%
  # Descending sort on `going_out`, intended to list the "yes" group first
  arrange(desc(going_out))
## `summarise()` ungrouping output (override with `.groups` argument)
# For each rating level, find the candy (ties included) that received that
# rating most often.
candy %>%
  # Rows without a rating are excluded from the tally
  drop_na(rating) %>%
  group_by(rating, candy) %>%
  summarise(count = n()) %>%
  # summarise() peels off `candy`, so max() is evaluated within each rating
  filter(count == max(count))
## `summarise()` regrouping output by 'rating' (override with `.groups` argument)
# Count the distinct people who rated starburst as "despair".
candy %>%
  filter(candy == "starburst", rating == "despair") %>%
  summarise(id_count = n_distinct(person_id))
# For the next two questions, count despair as -1, joy as +1 and meh as 0.
# Attach a numeric score to every row: joy = 1, meh = 0, despair = -1.
candy <- mutate(
  candy,
  rating_numeric = recode(rating, despair = -1, meh = 0, joy = 1)
)
# Highest-rated candy (ties kept) for each gender, by mean numeric rating.
candy %>%
  # Drop unrated rows up front: a gender/candy pair with no ratings would
  # otherwise average to NaN, max() would propagate it, and the final filter
  # would silently discard every row for that gender.
  filter(!is.na(rating_numeric)) %>%
  group_by(gender, candy) %>%
  summarise(
    avg_rating = mean(rating_numeric),
    .groups = "drop_last"
  ) %>%
  # Still grouped by gender here, so max() is taken per gender
  filter(avg_rating == max(avg_rating))
# Highest-rated candy (ties kept) for each year, by mean numeric rating.
candy %>%
  # Drop unrated rows up front: a year/candy pair with no ratings would
  # otherwise average to NaN, max() would propagate it, and the final filter
  # would silently discard every row for that year.
  filter(!is.na(rating_numeric)) %>%
  group_by(year, candy) %>%
  summarise(
    avg_rating = mean(rating_numeric),
    .groups = "drop_last"
  ) %>%
  # Still grouped by year here, so max() is taken per year
  filter(avg_rating == max(avg_rating))
# Highest-rated candy (ties kept) for each country bucket, by mean numeric
# rating. Countries other than the three named below, including missing
# values, are pooled into "other".
candy %>%
  # Drop unrated rows up front: a country/candy pair with no ratings would
  # otherwise average to NaN, max() would propagate it, and the final filter
  # would silently discard every row for that country.
  filter(!is.na(rating_numeric)) %>%
  mutate(
    country = if_else(
      country %in% c("united states", "canada", "united kingdom"),
      country,
      "other"
    )
  ) %>%
  group_by(country, candy) %>%
  summarise(
    avg_rating = mean(rating_numeric),
    .groups = "drop_last"
  ) %>%
  # Still grouped by country here, so max() is taken per country bucket
  filter(avg_rating == max(avg_rating))